library(GEOquery)
library(oligo)
# library(affy)
library(limma)
library(sva)
library(tidyverse)
library(factoextra)
library(pheatmap)
library(caret)
library(RColorBrewer)
# library(viridis)
library(ggsci)
# given a matrix, perform min-max scaling on its columns
min_max_mat <- function(mat){
mat_rescaled <- apply(mat, 2, function(v){
v_range <- range(v)
names(v_range) <- c("minimum", "maximum")
range_difference <- v_range["maximum"] - v_range["minimum"]
rescaled <- (v - v_range["minimum"])/range_difference
return(rescaled)
})
return(mat_rescaled)
}
# geodata <- GEOquery::getGEO(GEO = "GSE76275", destdir = "./tempfiles")
# geodata <- GEOquery::getGEO(filename = "./tempfiles/GSE76275_series_matrix.txt.gz")
# saveRDS(geodata, "geodata.RDS")
geodata <- readRDS("geodata.RDS")
# mdata <- geodata %>%
# pluck(1) %>%
# phenoData() %>%
# pData() %>% as_tibble()
# feature_data <- geodata %>%
# pluck(1) %>%
# featureData()
# write_csv(mdata, "raw_mdata.csv")
mdata <- read_csv("raw_mdata.csv")
Rows: 265 Columns: 69
── Column specification ──────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (62): title, geo_accession, status, submission_date, last_update_date, type, source_name_ch1, organism...
dbl (6): channel_count, taxid_ch1, contact_zip/postal_code, data_row_count, age (years):ch1, body mass in...
lgl (1): growth_protocol_ch1
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# saveRDS(feature_data, "featureData.RDS")
# feature_data <- readRDS("featureData.RDS")
mdata %>%
glimpse()
Rows: 265
Columns: 69
$ title <chr> "S1-H10", "S1-H14", "S1-H19", "S1-H20B", "S1-H22", "S1-H27", "S…
$ geo_accession <chr> "GSM1974566", "GSM1974567", "GSM1974568", "GSM1974569", "GSM197…
$ status <chr> "Public on Dec 18 2015", "Public on Dec 18 2015", "Public on De…
$ submission_date <chr> "Dec 17 2015", "Dec 17 2015", "Dec 17 2015", "Dec 17 2015", "De…
$ last_update_date <chr> "Dec 18 2015", "Dec 18 2015", "Dec 18 2015", "Dec 18 2015", "De…
$ type <chr> "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", …
$ channel_count <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ source_name_ch1 <chr> "Human breast cancer", "Human breast cancer", "Human breast can…
$ organism_ch1 <chr> "Homo sapiens", "Homo sapiens", "Homo sapiens", "Homo sapiens",…
$ characteristics_ch1 <chr> "tissue: Breast cancer", "tissue: Breast cancer", "tissue: Brea…
$ characteristics_ch1.1 <chr> "gender: Female", "age (years): 41", "age (years): 55", "age (y…
$ characteristics_ch1.2 <chr> "race: Caucasian", "gender: Female", "gender: Female", "gender:…
$ characteristics_ch1.3 <chr> "body mass index: 32", "race: Caucasian", "race: Caucasian", "r…
$ characteristics_ch1.4 <chr> "menopausal status: Post-Menopausal", "body mass index: 29", "h…
$ characteristics_ch1.5 <chr> "histology group: Infiltrating Ductal Carcinoma", "menopausal s…
$ characteristics_ch1.6 <chr> "histology: Infiltrating Ductal Carcinoma", "histology group: I…
$ characteristics_ch1.7 <chr> "ajcc stage (7th edition, 2010): T2N1M0", "histology: Infiltrat…
$ characteristics_ch1.8 <chr> "set: Validation TN", "ajcc stage (7th edition, 2010): T1N0M0",…
$ characteristics_ch1.9 <chr> "er: Negative", "set: Validation TN", "pr: Negative", "set: Val…
$ characteristics_ch1.10 <chr> "pr: Negative", "er: Negative", "her2: Negative", "er: Negative…
$ characteristics_ch1.11 <chr> "her2: Negative", "pr: Negative", "triple-negative status: TN",…
$ characteristics_ch1.12 <chr> "triple-negative status: TN", "her2: Negative", "tumor grade: P…
$ characteristics_ch1.13 <chr> "tumor size: 2 - 5 cm", "triple-negative status: TN", "tumor si…
$ characteristics_ch1.14 <chr> "positive nodes: 1 - 3", "tumor grade: Poorly Differentiated", …
$ characteristics_ch1.15 <chr> "metastases: No mets", "tumor size: <=2cm", "metastases: No met…
$ characteristics_ch1.16 <chr> "tnbc subtype: Mesenchymal (MES)", "positive nodes: 0", "tnbc s…
$ characteristics_ch1.17 <chr> NA, "metastases: No mets", NA, "metastases: No mets", NA, NA, N…
$ characteristics_ch1.18 <chr> NA, "tnbc subtype: Basal-Like Immune-Activated (BLIA)", NA, "tn…
$ treatment_protocol_ch1 <chr> "breast tumor samples, flash frozen", "breast tumor samples, fl…
$ growth_protocol_ch1 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ molecule_ch1 <chr> "total RNA", "total RNA", "total RNA", "total RNA", "total RNA"…
$ extract_protocol_ch1 <chr> "Trizol extraction of total RNA was performed according to the …
$ label_ch1 <chr> "biotin", "biotin", "biotin", "biotin", "biotin", "biotin", "bi…
$ label_protocol_ch1 <chr> "3' IVT express", "3' IVT express", "3' IVT express", "3' IVT e…
$ taxid_ch1 <dbl> 9606, 9606, 9606, 9606, 9606, 9606, 9606, 9606, 9606, 9606, 960…
$ hyb_protocol <chr> "Standard Affymetix protocol for 3' IVT design", "Standard Affy…
$ scan_protocol <chr> "Scanner GCS3000 7G", "Scanner GCS3000 7G", "Scanner GCS3000 7G…
$ description <chr> "log2 gene expression data", "log2 gene expression data", "log2…
$ data_processing <chr> "The data were analyzed with Affy package in R, RMA method", "…
$ data_processing.1 <chr> "Expressions estimated using RMA signal method", "Expressions e…
$ platform_id <chr> "GPL570", "GPL570", "GPL570", "GPL570", "GPL570", "GPL570", "GP…
$ contact_name <chr> "Suzanne,,Fuqua", "Suzanne,,Fuqua", "Suzanne,,Fuqua", "Suzanne,…
$ contact_institute <chr> "Baylor College of Medicine", "Baylor College of Medicine", "Ba…
$ contact_address <chr> "1 Baylor Plaza", "1 Baylor Plaza", "1 Baylor Plaza", "1 Baylor…
$ contact_city <chr> "Houston", "Houston", "Houston", "Houston", "Houston", "Houston…
$ contact_state <chr> "Texas", "Texas", "Texas", "Texas", "Texas", "Texas", "Texas", …
$ `contact_zip/postal_code` <dbl> 77030, 77030, 77030, 77030, 77030, 77030, 77030, 77030, 77030, …
$ contact_country <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", …
$ supplementary_file <chr> "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1974nnn/GSM1974566/s…
$ data_row_count <dbl> 54675, 54675, 54675, 54675, 54675, 54675, 54675, 54675, 54675, …
$ `age (years):ch1` <dbl> NA, 41, 55, 55, 65, 40, 66, 57, NA, 55, 44, 78, 54, 71, 37, 51,…
$ `ajcc stage (7th edition, 2010):ch1` <chr> "T2N1M0", "T1N0M0", "T2N0M0", "T1cN0M0", "T2N2MX", "T1NXMX", "T…
$ `body mass index:ch1` <dbl> 32, 29, NA, 31, 38, 22, 22, 38, 29, 32, 44, 28, 34, 32, 23, 23,…
$ `er:ch1` <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Ne…
$ `gender:ch1` <chr> "Female", "Female", "Female", "Female", "Female", "Female", "Fe…
$ `her2:ch1` <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Ne…
$ `histology group:ch1` <chr> "Infiltrating Ductal Carcinoma", "Infiltrating Ductal Carcinoma…
$ `histology:ch1` <chr> "Infiltrating Ductal Carcinoma", "Infiltrating Ductal Carcinoma…
$ `menopausal status:ch1` <chr> "Post-Menopausal", "Post-Menopausal", NA, "Post-Menopausal", "P…
$ `metastases:ch1` <chr> "No mets", "No mets", "No mets", "No mets", NA, NA, NA, NA, "No…
$ `positive nodes:ch1` <chr> "1 - 3", "0", "0", "0", "4-9", NA, NA, "0", "0", "4-9", "0", "1…
$ `pr:ch1` <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Ne…
$ `race:ch1` <chr> "Caucasian", "Caucasian", "Caucasian", "Caucasian", "Caucasian"…
$ `set:ch1` <chr> "Validation TN", "Validation TN", "Validation TN", "Validation …
$ `tissue:ch1` <chr> "Breast cancer", "Breast cancer", "Breast cancer", "Breast canc…
$ `tnbc subtype:ch1` <chr> "Mesenchymal (MES)", "Basal-Like Immune-Activated (BLIA)", "Mes…
$ `triple-negative status:ch1` <chr> "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN…
$ `tumor grade:ch1` <chr> NA, "Poorly Differentiated", "Poorly Differentiated", "Poorly D…
$ `tumor size:ch1` <chr> "2 - 5 cm", "<=2cm", "2 - 5 cm", "<=2cm", "2 - 5 cm", "<=2cm", …
mdata <- mdata %>%
select(title, contains("date"), geo_accession, contains(":ch1"))
colnames(mdata)
[1] "title" "submission_date"
[3] "last_update_date" "geo_accession"
[5] "age (years):ch1" "ajcc stage (7th edition, 2010):ch1"
[7] "body mass index:ch1" "er:ch1"
[9] "gender:ch1" "her2:ch1"
[11] "histology group:ch1" "histology:ch1"
[13] "menopausal status:ch1" "metastases:ch1"
[15] "positive nodes:ch1" "pr:ch1"
[17] "race:ch1" "set:ch1"
[19] "tissue:ch1" "tnbc subtype:ch1"
[21] "triple-negative status:ch1" "tumor grade:ch1"
[23] "tumor size:ch1"
cnames <- colnames(mdata)
cnames_processed <- str_split(cnames, pattern = ":") %>%
map_chr(~{.x[[1]]}) %>%
str_replace_all(" ", "_") %>%
str_replace_all("-", "_") %>%
str_remove_all("\\(|\\)|,")
cnames_processed
[1] "title" "submission_date" "last_update_date"
[4] "geo_accession" "age_years" "ajcc_stage_7th_edition_2010"
[7] "body_mass_index" "er" "gender"
[10] "her2" "histology_group" "histology"
[13] "menopausal_status" "metastases" "positive_nodes"
[16] "pr" "race" "set"
[19] "tissue" "tnbc_subtype" "triple_negative_status"
[22] "tumor_grade" "tumor_size"
colnames(mdata) <- cnames_processed
rm(cnames, cnames_processed)
glimpse(mdata)
Rows: 265
Columns: 23
$ title <chr> "S1-H10", "S1-H14", "S1-H19", "S1-H20B", "S1-H22", "S1-H27", "S1-H28", "…
$ submission_date <chr> "Dec 17 2015", "Dec 17 2015", "Dec 17 2015", "Dec 17 2015", "Dec 17 2015…
$ last_update_date <chr> "Dec 18 2015", "Dec 18 2015", "Dec 18 2015", "Dec 18 2015", "Dec 18 2015…
$ geo_accession <chr> "GSM1974566", "GSM1974567", "GSM1974568", "GSM1974569", "GSM1974570", "G…
$ age_years <dbl> NA, 41, 55, 55, 65, 40, 66, 57, NA, 55, 44, 78, 54, 71, 37, 51, 64, NA, …
$ ajcc_stage_7th_edition_2010 <chr> "T2N1M0", "T1N0M0", "T2N0M0", "T1cN0M0", "T2N2MX", "T1NXMX", "T1cNXMX", …
$ body_mass_index <dbl> 32, 29, NA, 31, 38, 22, 22, 38, 29, 32, 44, 28, 34, 32, 23, 23, 36, 22, …
$ er <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Negative", …
$ gender <chr> "Female", "Female", "Female", "Female", "Female", "Female", "Female", "F…
$ her2 <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Negative", …
$ histology_group <chr> "Infiltrating Ductal Carcinoma", "Infiltrating Ductal Carcinoma", "Infil…
$ histology <chr> "Infiltrating Ductal Carcinoma", "Infiltrating Ductal Carcinoma", "Infil…
$ menopausal_status <chr> "Post-Menopausal", "Post-Menopausal", NA, "Post-Menopausal", "Post-Menop…
$ metastases <chr> "No mets", "No mets", "No mets", "No mets", NA, NA, NA, NA, "No mets", N…
$ positive_nodes <chr> "1 - 3", "0", "0", "0", "4-9", NA, NA, "0", "0", "4-9", "0", "1 - 3", "1…
$ pr <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Negative", …
$ race <chr> "Caucasian", "Caucasian", "Caucasian", "Caucasian", "Caucasian", "Caucas…
$ set <chr> "Validation TN", "Validation TN", "Validation TN", "Validation TN", "Val…
$ tissue <chr> "Breast cancer", "Breast cancer", "Breast cancer", "Breast cancer", "Bre…
$ tnbc_subtype <chr> "Mesenchymal (MES)", "Basal-Like Immune-Activated (BLIA)", "Mesenchymal …
$ triple_negative_status <chr> "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", …
$ tumor_grade <chr> NA, "Poorly Differentiated", "Poorly Differentiated", "Poorly Differenti…
$ tumor_size <chr> "2 - 5 cm", "<=2cm", "2 - 5 cm", "<=2cm", "2 - 5 cm", "<=2cm", "<=2cm", …
# mdata %>%
# distinct(triple_negative_status, tnbc_subtype)
# distinct(submission_date, triple_negative_status)
# distinct(tnbc_subtype, triple_negative_status)
# distinct(histology, histology_group)
# distinct(er, her2, pr, triple_negative_status)
Looking at the number of samples for each combination of set and each condition.
mdata %>%
count(triple_negative_status, set)
Celfiles downloaded from GEO and kept the folder celfiles/
celFiles <- list.celfiles('celfiles/', full.names = TRUE, listGzipped = TRUE)
celFiles %>% head()
[1] "celfiles//GSM1974566_S1_H10.CEL.gz" "celfiles//GSM1974567_S1_H14.CEL.gz"
[3] "celfiles//GSM1974568_S1_H19.CEL.gz" "celfiles//GSM1974569_S1_H20B.CEL.gz"
[5] "celfiles//GSM1974570_S1_H22.CEL.gz" "celfiles//GSM1974571_S1_H27.CEL.gz"
names(celFiles) <- celFiles %>%
basename() %>%
str_split("\\.") %>%
map_chr(~{.x[1]}) %>%
str_split("_") %>%
map_chr(~{.x[1]})
head(celFiles)
GSM1974566 GSM1974567
"celfiles//GSM1974566_S1_H10.CEL.gz" "celfiles//GSM1974567_S1_H14.CEL.gz"
GSM1974568 GSM1974569
"celfiles//GSM1974568_S1_H19.CEL.gz" "celfiles//GSM1974569_S1_H20B.CEL.gz"
GSM1974570 GSM1974571
"celfiles//GSM1974570_S1_H22.CEL.gz" "celfiles//GSM1974571_S1_H27.CEL.gz"
head(mdata)
mdata <- mdata[match(mdata$geo_accession, names(celFiles)), ]
Getting only the relevant variables from the metadata.
mdata_subset <- mdata %>%
select(geo_accession,
title,
triple_negative_status,
tnbc_subtype,
submission_date,
er,
her2,
pr,
race,
set,
gender,
age_years) %>%
mutate(across(where(is.character), .fns = factor)) %>%
mutate(tnbc_subtype = if_else(is.na(as.character(tnbc_subtype)), "Not Applicable", as.character(tnbc_subtype))) %>%
mutate(tnbc_subtype = factor(tnbc_subtype)) %>%
as.data.frame()
rownames(mdata_subset) <- as.character(mdata_subset$geo_accession)
head(mdata_subset)
NA
# rawData <- read.celfiles(celFiles, phenoData = AnnotatedDataFrame(mdata_subset))
# saveRDS(object = rawData, "rawData.RDS")
rawData <- readRDS("rawData.RDS")
Looking at the dimensions of the raw expression matrix.
exprs(rawData) %>% dim()
[1] 1354896 265
res_1 <- rma(rawData)
Loading required package: RSQLite
Loading required package: DBI
Background correcting
Normalizing
Calculating Expression
# saveRDS(object = res_1, "res_1.RDS")
res_1 <- readRDS("res_1.RDS")
exprs(res_1) %>%
dim()
[1] 54675 265
exprs(res_1)[1:5, 1:5]
GSM1974566 GSM1974567 GSM1974568 GSM1974569 GSM1974570
1007_s_at 10.754163 11.361803 9.690693 10.157010 10.597699
1053_at 8.625663 9.011796 7.854072 7.925281 8.456781
117_at 7.333973 7.385199 7.466878 8.048563 7.581895
121_at 9.077887 8.782080 8.885189 8.775218 8.752407
1255_g_at 4.656331 4.635625 4.536114 4.626950 5.454820
Getting lists of the TNBC samples and the non-TNBC samples.
tnbc_samples <- mdata_subset %>%
filter(triple_negative_status == "TN") %>%
select(geo_accession) %>%
unlist(use.names = F) %>%
as.character()
head(tnbc_samples)
[1] "GSM1974566" "GSM1974567" "GSM1974568" "GSM1974569" "GSM1974570" "GSM1974571"
nontnbc_samples <- mdata_subset %>%
filter(triple_negative_status == "not TN") %>%
select(geo_accession) %>%
unlist(use.names = F) %>%
as.character()
head(nontnbc_samples)
[1] "GSM1978883" "GSM1978884" "GSM1978885" "GSM1978886" "GSM1978887" "GSM1978888"
Creating different metadata tables for TNBC and nonTNBC.
mdata_subset_tnbc <- mdata_subset[tnbc_samples, ]
dim(mdata_subset_tnbc)
[1] 198 12
mdata_subset_nontnbc <- mdata_subset[nontnbc_samples, ]
dim(mdata_subset_nontnbc)
[1] 67 12
Reading in the TNBC files.
# rawData_tnbc <- read.celfiles(filenames = celFiles[tnbc_samples],
# phenoData = AnnotatedDataFrame(mdata_subset_tnbc))
#
# rawData_tnbc
# saveRDS(rawData_tnbc, file = "rawData_tnbc.RDS")
rawData_tnbc <- readRDS(file = "rawData_tnbc.RDS")
rawData_tnbc
ExpressionFeatureSet (storageMode: lockedEnvironment)
assayData: 1354896 features, 198 samples
element names: exprs
protocolData
rowNames: GSM1974566 GSM1974567 ... GSM1974763 (198 total)
varLabels: exprs dates
varMetadata: labelDescription channel
phenoData
rowNames: GSM1974566 GSM1974567 ... GSM1974763 (198 total)
varLabels: geo_accession title ... age_years (11 total)
varMetadata: labelDescription channel
featureData: none
experimentData: use 'experimentData(object)'
Annotation: pd.hg.u133.plus.2
Loading required package: pd.hg.u133.plus.2
Loading required package: RSQLite
Loading required package: DBI
Reading in the nonTNBC files.
# rawData_nontnbc <- read.celfiles(filenames = celFiles[nontnbc_samples],
# phenoData = AnnotatedDataFrame(mdata_subset_nontnbc))
#
# rawData_nontnbc
# saveRDS(rawData_nontnbc, file = "rawData_nontnbc.RDS")
rawData_nontnbc <- readRDS(file = "rawData_nontnbc.RDS")
rawData_nontnbc
ExpressionFeatureSet (storageMode: lockedEnvironment)
assayData: 1354896 features, 67 samples
element names: exprs
protocolData
rowNames: GSM1978883 GSM1978884 ... GSM1978949 (67 total)
varLabels: exprs dates
varMetadata: labelDescription channel
phenoData
rowNames: GSM1978883 GSM1978884 ... GSM1978949 (67 total)
varLabels: geo_accession title ... age_years (11 total)
varMetadata: labelDescription channel
featureData: none
experimentData: use 'experimentData(object)'
Annotation: pd.hg.u133.plus.2
Performing RMA on TNBC data.
# res_tnbc <- rma(rawData_tnbc)
# saveRDS(res_tnbc, file = "res_tnbc.RDS")
res_tnbc <- readRDS(file = "res_tnbc.RDS")
Performing RMA on nonTNBC data.
# res_nontnbc <- rma(rawData_nontnbc)
# saveRDS(res_nontnbc, file = "res_nontnbc.RDS")
res_nontnbc <- readRDS(file = "res_nontnbc.RDS")
Combining the expression matrices of TNBC and nonTNBC data after separate RMA.
res_joint <- cbind(exprs(res_tnbc), exprs(res_nontnbc))
res_joint[1:5, 1:5]
GSM1974566 GSM1974567 GSM1974568 GSM1974569 GSM1974570
1007_s_at 10.703231 11.345325 9.732361 10.175704 10.576463
1053_at 8.605269 9.019553 7.794234 7.945029 8.497373
117_at 7.360884 7.373951 7.481344 8.047586 7.530427
121_at 9.100729 8.776369 8.860402 8.755606 8.766560
1255_g_at 4.664492 4.628692 4.569530 4.627230 5.467292
Saving the joint expression matrix from class-specific QN.
res_joint %>%
as_tibble(rownames = "probe_id") %>%
write_csv("dataframe_files/post_classQN_expression.csv")
Error: Cannot open file for writing:
* 'dataframe_files/post_classQN_expression.csv'
Saving a subset of the metadata that I think is relevant.
Saving the TNBC and nonTNBC metadata separately, just in case.
mdata_subset_nontnbc %>%
write_csv("dataframe_files/metadata_subset_nontnbc.csv")
res_1_df_long <- res_1 %>%
exprs() %>%
as_tibble(rownames = "probeID") %>%
pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id",
values_to = "intensity") %>%
left_join(., mdata_subset, by = c("sample_id" = "geo_accession"))
# saveRDS(object = res_1_df_long, "res_1_df_long.RDS")
res_1_df_long <- readRDS("res_1_df_long.RDS")
p2 <- res_1_df_long %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity,
color = set)) +
labs(x = "samples",
title = str_wrap("Sample-wise log2 intensity boxplots for whole QN")) +
scale_color_npg() +
theme(axis.text.x = element_blank())
p2
NA
ggsave("plots/exploration_plots/GSE76275_post_regQN_boxplots.png",
p2,
units = "cm", width = 30, height = 10)
rm(res_1_df_long)
res_joint_df_long <- res_joint %>%
as_tibble(rownames = "probeID") %>%
pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id",
values_to = "intensity")
# saveRDS(object = res_joint_df_long, "res_joint_df_long.RDS")
res_joint_df_long <- readRDS("res_joint_df_long.RDS")
p1 <- res_joint_df_long %>%
left_join(., mdata_subset, by = c("sample_id" = "geo_accession")) %>%
mutate(sample_id = factor(sample_id)) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity,
color = set)) +
labs(x = "samples",
title = str_wrap("Sample-wise log2 intensity boxplots for classQN", 60)) +
scale_color_npg() +
theme(axis.text.x = element_blank())
p1
ggsave("plots/exploration_plots/GSE76275_post_classQN_boxplots.png",
p1,
units = "cm", width = 30, height = 10)
Function to create an annotated data frame by combining PC scores as well as metadata: useful for ggplot visualization.
get_pca_annot_df <- function(pca.obj, sample_id_col, mdata_df){
ind_scores <- pca.obj$x
ind_scores_reordered <- ind_scores[match(rownames(ind_scores), mdata_df[[sample_id_col]]), ] %>%
as_tibble(rownames = sample_id_col) %>%
mutate(filename = factor(!!sym(sample_id_col)))
ind_scores_annot <- left_join(ind_scores_reordered, y = mdata_df, by = sample_id_col) %>%
select(all_of(colnames(mdata_subset)), contains("PC"))
return(ind_scores_annot)
}
pca.res_1 <- res_1 %>%
exprs() %>%
t() %>%
prcomp(center = TRUE, scale = TRUE)
Getting the annotated data frame for the PCA.
pca.res_1.annot_df <- get_pca_annot_df(pca.obj = pca.res_1, sample_id_col = "geo_accession", mdata_df= mdata_subset)
head(pca.res_1.annot_df)
Looking at the variance explained by the first 10 PCs.
fviz_eig(pca.res_1) +
labs(x = "Principal Component",
title = str_wrap("Scree plot for the first 10 principal components for regular RMA-normalized data", 60)) +
theme(title = element_text(size = 15))
Superimposing variables in data upon sample PCA scores. The PCA does not seem to separate the TNBC and nonTNBC samples that well when regular RMA is performed.
ggplot(pca.res_1.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = triple_negative_status)) +
ggtitle("Samples in first two PCs, \ncoloured by triple_negative_status for whole QN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_wholeQN_TNBC_status.png")
Saving 7.29 x 4.51 in image
NA
The samples do not seem to separate well by set either.
ggplot(pca.res_1.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = set)) +
ggtitle("Samples in first two PCs, \ncoloured by set (discovery or validation) for whole QN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_wholeQN_set.png")
Saving 7.29 x 4.51 in image
There does not seem to be too strong of a batch effect according to submission date.
ggplot(pca.res_1.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = submission_date)) +
ggtitle("Samples in first two PCs, \ncoloured by submission date for whole QN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggsave("plots/exploration_plots/PCA_wholeQN_set.png")
Saving 7.29 x 4.51 in image
ggplot(pca.res_1.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = tnbc_subtype)) +
ggtitle("Samples in first two PCs, \ncoloured by tnbc_subtype for whole QN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5), legend.position = "right", legend.direction = "vertical", legend.key.width = unit(x = 0.5, units = "cm"))
pca.res_joint <- res_joint %>%
t() %>%
prcomp(center = TRUE, scale = TRUE)
saveRDS(pca.res_joint, "pca_res_joint.RDS")
# pca.res_joint <- readRDS("pca_res_joint.RDS")
Getting the annotated data frame for the PCA.
pca.res_joint.annot_df <- get_pca_annot_df(pca.obj = pca.res_joint, sample_id_col = "geo_accession", mdata_df= mdata_subset)
head(pca.res_joint.annot_df)
Looking at the variance explained by the first 10 PCs.
fviz_eig(pca.res_joint) +
labs(x = "Principal Component",
title = str_wrap("Scree plot for the first 10 principal components for classQN-normalized data", 60)) +
theme(title = element_text(size = 15))
Superimposing variables in data upon sample PCA scores. The PCA does separate the TNBC and nonTNBC samples well when class-specific RMA is performed.
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = triple_negative_status)) +
ggtitle("Samples in first two PCs, \ncoloured by triple_negative_status for classQN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
NA
The validation nonTNBC samples are separated from the discovery TNBC and validation TNBC samples.
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = set)) +
ggtitle("Samples in first two PCs, \ncoloured by set (discovery or validation) for classQN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
Submission date is perfectly confounded with TNBC status. May or may not be batch effects.
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = submission_date)) +
ggtitle("Samples in first two PCs, \ncoloured by submission date) for classQN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5))
ggplot(pca.res_joint.annot_df) +
geom_point(mapping = aes(x = PC1, y = PC2, colour = tnbc_subtype)) +
ggtitle("Samples in first two PCs, \ncoloured by tnbc_subtype for classQN") +
guides(colour = guide_legend(override.aes = list(size= 4))) +
theme(axis.text.x = element_text(angle = 90, size = 7),
title = element_text(size = 10),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black", size = 0.5), legend.position = "right", legend.direction = "vertical", legend.key.width = unit(x = 0.5, units = "cm"))
perform_min_max <- function(x){
mm_transformation <- preProcess(x, method = "range")
rescaled <- predict(mm_transformation, x)
return(rescaled)
}
Getting distances after performing min max normalization.
res_1_dists <- exprs(res_1) %>%
t() %>%
perform_min_max() %>%
dist(method = "euclidean")
# saveRDS(res_1_dists, "res_1_dists.RDS")
res_1_dists <- readRDS("res_1_dists.RDS")
res_joint_dists <- res_joint %>%
t() %>%
perform_min_max() %>%
dist(method = "euclidean")
# saveRDS(res_joint_dists, "res_joint_dists.RDS")
res_joint_dists <- readRDS("res_joint_dists.RDS")
res_1_dend <- res_1_dists %>%
hclust() %>%
as.dendrogram()
res_joint_dend <- res_joint_dists %>%
hclust() %>%
as.dendrogram()
library(dendextend)
---------------------
Welcome to dendextend version 1.15.2
Type citation('dendextend') for how to cite the package.
Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/
Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
You may ask questions at stackoverflow, use the r and dendextend tags:
https://stackoverflow.com/questions/tagged/dendextend
To suppress this message use: suppressPackageStartupMessages(library(dendextend))
---------------------
Attaching package: ‘dendextend’
The following object is masked from ‘package:Biostrings’:
nnodes
The following object is masked from ‘package:stats’:
cutree
# res_1_dend %>%
# labels()
# res_1_dend %>%
# order.dendrogram()
# (res_1 %>%
# exprs() %>%
# colnames())[28]
res_1_dend_laborder <- res_1_dend %>%
labels()
mycolors <- ifelse(mdata_subset[res_1_dend_laborder, ]$triple_negative_status == "TN", "forestgreen", "maroon")
par(mar = c(10,2,1,1))
res_1_dend %>%
set("labels_cex", 0.1) %>%
plot()
colored_bars(colors = mycolors, dend = res_1_dend, rowLabels = "TN Status", add = TRUE)
res_joint_dend_laborder <- res_joint_dend %>%
labels()
mycolors <- ifelse(mdata_subset[res_joint_dend_laborder, ]$triple_negative_status == "TN", "forestgreen", "maroon")
par(mar = c(10,2,1,1))
res_joint_dend %>%
set("labels_cex", 0.1) %>%
plot()
colored_bars(colors = mycolors, dend = res_joint_dend, rowLabels = "TN Status", add = TRUE)
Function to process distance object into a distance matrix for heatmap visualization.
get_distmat <- function(x){
distmat <- as.matrix(x)
colnames(distmat) <- NULL
diag(distmat) <- NA
return(distmat)
}
row_annot <- mdata_subset %>%
select(set, submission_date, tnbc_subtype)
head(row_annot)
set.seed(1)
row_colours <- list( "set" = c("steelblue", "maroon", "gold"),
"submission_date" = sample(colorRampPalette(colors = brewer.pal(n = 8, name = "Set2"))(8), 2),
"tnbc_subtype" = sample(colorRampPalette(colors = brewer.pal(n = 8, name = "Dark2"))(8), 5)
)
names(row_colours$set) <- as.character(unique(mdata_subset$set))
names(row_colours$submission_date) <- as.character(unique(mdata_subset$submission_date))
names(row_colours$tnbc_subtype) <- as.character(unique(mdata_subset$tnbc_subtype))
str(row_colours)
List of 3
$ set : Named chr [1:3] "steelblue" "maroon" "gold"
..- attr(*, "names")= chr [1:3] "Validation TN" "Discovery TN" "Validation not TN"
$ submission_date: Named chr [1:2] "#66C2A5" "#E78AC3"
..- attr(*, "names")= chr [1:2] "Dec 17 2015" "Dec 22 2015"
$ tnbc_subtype : Named chr [1:5] "#A6761D" "#1B9E77" "#D95F02" "#66A61E" ...
..- attr(*, "names")= chr [1:5] "Mesenchymal (MES)" "Basal-Like Immune-Activated (BLIA)" "Basal-Like Immune-Suppressed (BLIS)" "Luminal-AR (LAR)" ...
res_1_dists %>%
get_distmat() %>%
pheatmap(.,
annotation_row = row_annot,
annotation_colors = row_colours,
show_colnames = F,
show_rownames = F,
cutree_rows = 3,
main = str_wrap("Heatmap of sample distances for whole QN expression matrix", 60),
legend_labels = c("small distance", "large distance"),
legend_breaks = c(min(., na.rm = TRUE),
max(., na.rm = TRUE)))
res_joint_dists %>%
get_distmat() %>%
pheatmap(.,
annotation_row = row_annot,
annotation_colors = row_colours,
show_colnames = F,
show_rownames = F,
cutree_rows = 2,
cutree_cols = 2,
main = str_wrap("Heatmap of sample distances for class-specific QN expression matrix", 60),
legend_labels = c("small distance", "large distance"),
legend_breaks = c(min(., na.rm = TRUE),
max(., na.rm = TRUE)))
full_mod <- mdata_subset %>%
select(geo_accession, triple_negative_status) %>%
arrange(triple_negative_status) %>%
model.matrix(~triple_negative_status, data = .)
head(full_mod)
(Intercept) triple_negative_statusTN
GSM1978883 1 0
GSM1978884 1 0
GSM1978885 1 0
GSM1978886 1 0
GSM1978887 1 0
GSM1978888 1 0
red_mod <- model.matrix(~1, data = mdata_subset)
head(red_mod)
(Intercept)
GSM1974566 1
GSM1974567 1
GSM1974568 1
GSM1974569 1
GSM1974570 1
GSM1974571 1
Get number of significant surrogate variables.
n.sv.wholeQN <- num.sv(exprs(res_1), full_mod, method="leek")
n.sv.wholeQN
[1] 0
svobj.wholeQN <- sva(exprs(res_1), mod = full_mod, mod0 = red_mod, n.sv = 1)
Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
left_join(sv_df.wholeQN, mdata, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
# geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
theme_light() +
labs(y = "Surrogate Variable Value", title = "Distribution of latent variable estimated by SVA for different grouping factors")
# ggsave("plots/exploration_plots/sva_grouping_normalRMA.png")
Create full model matrix.
full_mod <- mdata_subset %>%
select(geo_accession, triple_negative_status) %>%
arrange(triple_negative_status) %>%
model.matrix(~triple_negative_status, data = .)
head(full_mod)
(Intercept) triple_negative_statusTN
GSM1978883 1 0
GSM1978884 1 0
GSM1978885 1 0
GSM1978886 1 0
GSM1978887 1 0
GSM1978888 1 0
Create reduced model matrix.
red_mod <- model.matrix(~1, data = mdata_subset)
head(red_mod)
(Intercept)
GSM1974566 1
GSM1974567 1
GSM1974568 1
GSM1974569 1
GSM1974570 1
GSM1974571 1
Get number of significant surrogate variables.
n.sv.classQN <- num.sv(res_joint, full_mod, method="leek")
n.sv.classQN
[1] 1
Perform SVA on classQN-normalized expression matrix.
svobj.classQN <- sva(res_joint, mod = full_mod, mod0 = red_mod, n.sv = n.sv.classQN)
Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
sv_df.classQN <- tibble("geo_accession" = colnames(res_joint), "sv" = svobj.classQN$sv)
head(sv_df.classQN)
saveRDS(sv_df.classQN, "sv_df_classQN.RDS")
# sv_df.classQN <- readRDS("sv_df_classQN.RDS")
left_join(sv_df.classQN , mdata, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
# geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
theme_light() +
labs(y = "Surrogate Variable Value", title = "Distribution of latent variable estimated by SVA for different grouping factors")
ggsave("plots/exploration_plots/sva_grouping_classQN.png")
Saving 7.29 x 4.51 in image
In this attempt, I perform no quantile normalization while performing RMA. If QN has not been performed and a surrogate variable shows up that corresponds to batch, batch effects are probably present.
rawData.summary <- rma(rawData, background = TRUE, normalize = FALSE)
Background correcting
Calculating Expression
rawData.summary_df_long <- rawData.summary %>%
exprs() %>%
as_tibble(rownames = "probeID") %>%
pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id",
values_to = "intensity") %>%
left_join(., mdata_subset, by = c("sample_id" = "geo_accession"))
p3 <- rawData.summary_df_long %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity,
color = set)) +
labs(x = "samples",
title = str_wrap("Sample-wise log2 intensity boxplots in the absence of QN", 60)) +
scale_color_npg() +
theme(axis.text.x = element_blank())
p3
ggsave("plots/exploration_plots/GSE76275_noQN_boxplots.png",
p1,
units = "cm", width = 30, height = 10)
Getting the number of surrogate variables in the absence of quantile normalization.
n.sv.nonorm <- num.sv(exprs(rawData.summary), full_mod, method="leek")
There is one surrogate variable present in the absence of QN.
n.sv.nonorm
[1] 1
svobj.nonorm <- sva(exprs(rawData.summary), mod = full_mod, mod0 = red_mod, n.sv = n.sv.nonorm)
Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
sv_df.nonorm <- tibble("geo_accession" = colnames(exprs(rawData.summary)), "sv" = svobj.nonorm$sv)
head(sv_df.nonorm)
left_join(sv_df.nonorm, mdata, by = "geo_accession") %>%
mutate(index = 5) %>%
ggplot() +
# geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
theme_light() +
labs(y = "Surrogate Variable Value",
title = str_wrap("Distribution of latent variable estimated by SVA for different grouping factors", 60))
ggsave("plots/exploration_plots/sva_grouping_noQN.png")
Saving 7.29 x 4.51 in image